cd "/Users/TaihuaLi/Desktop/DePaul/CSC478 Programming Data Mining Apps/Homework/Homework 2/newsgroups"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
def KNN_Classifier(instance, dat, label, k, measure):
    """Classify `instance` by majority vote among its k nearest neighbors.

    Parameters
    ----------
    instance : 1-D array, the document/vector to classify.
    dat      : 2-D array of training vectors, one document per row.
    label    : 1-D array of non-negative integer class labels aligned with `dat`.
    k        : number of neighbors that vote.
    measure  : "euclidean" (distance) or "cosine" (1 - cosine similarity).

    Returns
    -------
    (majority_class, indices_of_k_nearest_neighbors)

    Fixes vs. the original: the cosine norms are vectorized instead of a
    Python loop; the doubly-nested fancy index (dat[[neighbor_index]]) and
    the unused neighbor_record variable are removed; neighbor labels are
    cast to int so np.bincount also accepts float-typed label arrays; an
    unknown `measure` now raises ValueError instead of a NameError on dists.
    """
    if measure == "euclidean":
        dists = np.sqrt(((dat - instance) ** 2).sum(axis=1))
    elif measure == "cosine":
        dat_norm = np.linalg.norm(dat, axis=1)
        instance_norm = np.linalg.norm(instance)
        sims = np.dot(dat, instance) / (dat_norm * instance_norm)
        dists = 1 - sims
    else:
        raise ValueError("measure must be 'euclidean' or 'cosine', got %r" % measure)
    # Indices of the k smallest distances (ties broken by argsort order).
    neighbor_index = np.argsort(dists)[:k]
    votes = np.bincount(np.asarray(label)[neighbor_index].astype(int))
    return np.argmax(votes), neighbor_index
def Comp_Accuracy(testdata, testlabel, traindata, trainlabel, k, measure):
    """Fraction of test rows whose KNN prediction matches column 1 of testlabel.

    testlabel is a 2-D array whose second column holds the true class id.
    """
    n_test = testdata.shape[0]
    hits = 0
    for row in range(n_test):
        predicted = KNN_Classifier(testdata[row], traindata, trainlabel, k, measure)[0]
        if predicted == testlabel[row, 1]:
            hits += 1
    return float(hits) / float(n_test)
# Load the newsgroups vocabulary and the term-document matrices.
terms = np.genfromtxt("modifiedterms.txt", dtype=str)
# _TD suffix: term-document orientation as stored on disk.
TrainMatrix_TD = np.genfromtxt("trainMatrixModified.txt", delimiter="\t")
# Transpose to document-term orientation for the KNN routines.
TrainMatrix_DT = TrainMatrix_TD.T
# Class files: presumably [doc_id, class] per row — column 1 is used later.
TrainClass = np.loadtxt("trainClasses.txt", delimiter="\t", dtype=int)
TestMatrix_TD = np.genfromtxt("testMatrixModified.txt", delimiter="\t")
TestMatrix_DT = TestMatrix_TD.T
TestClass = np.loadtxt("testClasses.txt", delimiter="\t", dtype=int)
Euclidean distance, k=(1, 20)
euc_accuracy = []
for i in range(1, 21):
result = Comp_Accuracy(TestMatrix_DT, TestClass, TrainMatrix_DT, TrainClass[:,1], i, "euclidean")
euc_accuracy.append(result)
print euc_accuracy
cosine similarity, k=(1, 20)
cos_accuracy = []
for i in range(1, 21):
result = Comp_Accuracy(TestMatrix_DT, TestClass, TrainMatrix_DT, TrainClass[:,1], i, "cosine")
cos_accuracy.append(result)
print cos_accuracy
# Accuracy vs. k on the raw count data, both measures on one axis.
k = np.array(range(1, 21))
euc = np.array(euc_accuracy)
cos = np.array(cos_accuracy)
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Comparison of KNN Classifier Performance Using Between \n Euclidean Distance and Cosine Similarity")
ax1.set_xlabel('Number of Neighbors (k)')
ax1.set_ylabel('Accuracy Rate')
plt.plot(k, cos, 'bs', label='Cosine Similarity')
plt.plot(k, euc, 'r--', label='Euclidean Distance')
# Legend anchored outside the axes to the upper right.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# Corpus dimensions.
numTerms = len(TrainMatrix_DT[0])
train_numDocs = TrainMatrix_DT.shape[0]
test_numDocs = TestMatrix_DT.shape[0]
print numTerms, train_numDocs, test_numDocs
# Document frequency of each term, counted over the TRAINING set only.
Doc_freq_train = np.array([(TrainMatrix_TD!=0).sum(1)]).T
# Numerator matrices filled with the TRAINING document count — test IDF
# deliberately reuses training statistics (train N and train df).
TrainNMatrix = np.ones(np.shape(TrainMatrix_TD), dtype=float)*train_numDocs
TestNMatrix = np.ones(np.shape(TestMatrix_TD), dtype=float)*train_numDocs
# IDF = log2(N / df); df == 0 would give inf — assumes every term occurs
# at least once in the training set (TODO confirm).
train_IDF = np.log2(np.divide(TrainNMatrix, Doc_freq_train))
test_IDF = np.log2(np.divide(TestNMatrix, Doc_freq_train))
# Apply TFxIDF weighting, then return to document-term orientation.
train_TFIDF = TrainMatrix_TD * train_IDF
test_TFIDF = TestMatrix_TD * test_IDF
NewTrainDT = train_TFIDF.T
NewTestDT = test_TFIDF.T
new_euc_accuracy = []
for i in range(1, 21):
result = Comp_Accuracy(NewTestDT, TestClass, NewTrainDT, TrainClass[:,1], i, "euclidean")
new_euc_accuracy.append(result)
print new_euc_accuracy
new_cos_accuracy = []
for i in range(1, 21):
result = Comp_Accuracy(NewTestDT, TestClass, NewTrainDT, TrainClass[:,1], i, "cosine")
new_cos_accuracy.append(result)
print new_cos_accuracy
new_euc = np.array(new_euc_accuracy)
new_cos = np.array(new_cos_accuracy)
fig = plt.figure(figsize=(10,4))
ax2 = fig.add_subplot(121)
ax2.set_title("Comparison of KNN Classifier Performance Using Between \n Euclidean Distance and Cosine Similarity \n without TFxIDF Weights")
ax2.set_xlabel('Number of Neighbors (k)')
ax2.set_ylabel('Accuracy Rate')
# This figure intentionally re-plots the raw-count results (cos/euc) to
# match the "without TFxIDF Weights" title; new_cos/new_euc come next.
plt.plot(k, cos, 'bs', label='Cosine Similarity')
plt.plot(k, euc, 'r--', label='Euclidean Distance')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# Accuracy vs. k on the TFxIDF-weighted data, both measures.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Comparison of KNN Classifier Performance Using Between \n Euclidean Distance and Cosine Similarity \n with TFxIDF Weights")
ax1.set_xlabel('Number of Neighbors (k)')
ax1.set_ylabel('Accuracy Rate')
plt.plot(k, new_cos, 'bs', label='Cosine Similarity')
plt.plot(k, new_euc, 'r--', label='Euclidean Distance')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
As shown above, the accuracies from measures between cosine similarity and euclidean distance deviate significantly in the first graph, which is a result of using the raw count data for the KNN classifier. In the second graph, with TFxIDF data, the accuracies of both measures still differ significantly. However, the accuracies for cosine similarity improves a lot. When K=10, the accuracy of using cosine similarity reaches 0.995 and as k increases, the accuracy fluctuates between 0.99 and 0.995. For the euclidean distance measure, the accuracy in general is actually worse than previously. When K=5, the accuracy of using euclidean distance measurement is 0.8, which is the highest among all the k values tested.
def Rocchio_Classifier(training, labels, instance):
    """Binary Rocchio (prototype) classifier using cosine similarity.

    Parameters
    ----------
    training : 2-D array, documents in rows.
    labels   : 1-D array of 0/1 class labels aligned with `training` rows.
    instance : 1-D array, the document to classify.

    Returns
    -------
    (predicted_class, cos_sim_to_class_1, cos_sim_to_class_0)
    Ties (equal similarity) go to class 1, matching the original behavior.

    Fix vs. the original: the original stacked the label column onto each
    row before summing, so the prototype norms included the accumulated
    label values and the class-1 similarity was systematically deflated.
    Prototypes are now built from the feature columns only.
    """
    is_one = (labels == 1)
    # Class prototypes: per-term sums over each class's documents. Cosine
    # similarity is scale-invariant, so sums behave like centroids here.
    class_one = training[is_one].sum(axis=0)
    class_zero = training[~is_one].sum(axis=0)
    instance_norm = np.linalg.norm(instance)
    cos_sims_one = np.dot(class_one, instance) / (np.linalg.norm(class_one) * instance_norm)
    cos_sims_zero = np.dot(class_zero, instance) / (np.linalg.norm(class_zero) * instance_norm)
    if cos_sims_one >= cos_sims_zero:
        return 1, cos_sims_one, cos_sims_zero
    return 0, cos_sims_one, cos_sims_zero
def Rocchio_Accuracy(training, trainlabel, testing, testlabel):
    """Print and return the Rocchio classifier's accuracy on `testing`.

    testlabel is a 2-D array whose second column holds the true class.
    Fix vs. the original: the accuracy is now also returned — the original
    only printed it, so callers could not use the value.
    """
    correct = 0
    for i in range(testing.shape[0]):
        pred_class = Rocchio_Classifier(training, trainlabel, testing[i])
        if pred_class[0] == testlabel[i, 1]:
            correct += 1
    accuracy_rate = float(correct) / float(testing.shape[0])
    # Parenthesized single-argument print works identically under Python 2.
    print("The accuracy rate is %f." % accuracy_rate)
    return accuracy_rate
Rocchio_Accuracy(NewTrainDT, TrainClass[:,1], NewTestDT, TestClass)
Comparing the accuracy of the Rocchio method with the K-Nearest Neighbor algorithm on the TFxIDF data, overall, the KNN algorithm with the cosine similarity measure produces better accuracies, as its highest accuracy is 0.995.
cd "/Users/TaihuaLi/Desktop/DePaul/CSC478 Programming Data Mining Apps/Homework/Homework 2"
from sklearn.feature_extraction import DictVectorizer
from sklearn import neighbors, tree, naive_bayes
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Load the bank marketing data; column 0 is the row index.
dat = pd.read_csv("bank_data.csv", sep=',', index_col=0, header=0)
dat.head(5)
# Separate the target ('pep') and one-hot encode the categorical features.
labels = dat['pep']
dat = dat.drop('pep', axis=1)
dat = pd.get_dummies(dat)
dat = np.array(dat)
labels = np.array(labels)
# 80/20 train/test split with a fixed seed for reproducibility.
dat_train, dat_test, dat_target_train, dat_target_test = train_test_split(dat, labels, test_size=0.2, random_state=33)
print dat_train.shape, dat_test.shape
# Min-max scale using statistics fitted on the training split only.
min_max_scaler = preprocessing.MinMaxScaler().fit(dat_train)
dat_train_norm = min_max_scaler.transform(dat_train)
dat_test_norm = min_max_scaler.transform(dat_test)
# KNN (k=17, uniform weights) on the min-max-normalized features.
n_neighbors = 17
knnclf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
knnclf.fit(dat_train_norm, dat_target_train)
knnpreds_test = knnclf.predict(dat_test_norm)
knncm = confusion_matrix(dat_target_test, knnpreds_test)
knn_train_score = knnclf.score(dat_train_norm, dat_target_train)
knn_test_score = knnclf.score(dat_test_norm, dat_target_test)
print classification_report(dat_target_test, knnpreds_test),
print "\n Confusion Matrix \n",knncm, "\n"
print "Average accuracy is", (knn_train_score+knn_test_score)/2
print "Training score is %f" % knn_train_score
print "Testing score is %f" % knn_test_score
# Confusion-matrix heatmap: rows = actual, columns = predicted.
plt.matshow(knncm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
As shown above, from experiments with k between 1 and 20 and with weights parameters between "distance" and "uniform", the accuracy is the highest when k=17 with the weights parameter is "uniform". As shown above, the accuracy of the algorithm is 0.7.
Decision Tree
# Decision tree (default Gini criterion) on the UNSCALED features —
# trees are scale-invariant, so normalization is unnecessary here.
treeclf = tree.DecisionTreeClassifier()
treeclf = treeclf.fit(dat_train, dat_target_train)
treepreds_test = treeclf.predict(dat_test)
treecm = confusion_matrix(dat_target_test, treepreds_test)
tree_train_score = treeclf.score(dat_train, dat_target_train)
tree_test_score = treeclf.score(dat_test, dat_target_test)
print classification_report(dat_target_test, treepreds_test)
print "\n Confusion Matrix \n", treecm, "\n"
print "Average accuracy is", (tree_train_score+tree_test_score)/2
print "Training score is %f" % tree_train_score
print "Testing score is %f" % tree_test_score
# Confusion-matrix heatmap: rows = actual, columns = predicted.
plt.matshow(treecm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
Naive Bayes
# Gaussian Naive Bayes on the unscaled features.
nbclf = naive_bayes.GaussianNB()
nbclf = nbclf.fit(dat_train, dat_target_train)
nbpreds_test = nbclf.predict(dat_test)
nbcm = confusion_matrix(dat_target_test, nbpreds_test)
nb_train_score = nbclf.score(dat_train, dat_target_train)
nb_test_score = nbclf.score(dat_test, dat_target_test)
print classification_report(dat_target_test, nbpreds_test)
print "\n Confusion Matrix \n", nbcm, "\n"
print "Average accuracy is", (nb_train_score+nb_test_score)/2
print "Training score is %f" % nb_train_score
print "Testing score is %f" % nb_test_score
# Confusion-matrix heatmap: rows = actual, columns = predicted.
plt.matshow(nbcm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
As shown above, the decision tree classifier produces the highest accuracy among the three classifiers. However, the difference between the decision tree score on the training and testing data sets is 0.2, which is greater than 0.1 and indicates an overfitted model. Therefore, we can only compare the KNN and Naive Bayes classifiers. Between these two classifiers, the KNN classifier has a higher accuracy rate of 0.7. Also, the difference of the KNN classifier scores between the training and testing data sets is a lot smaller than that of the Naive Bayes classifier. Therefore, for this data set, the K-nearest neighbor classifier is the best algorithm.
# Load the census data, treating "?" as a missing value.
cs_dat = pd.read_csv("adult-modified.csv", sep=',', header=0, na_values=["?"])
cs_dat.head(5)
cs_dat.describe(include="all")
# Inspect missing values column by column.
cs_dat[cs_dat.age.isnull()]
cs_dat[cs_dat.workclass.isnull()]
cs_dat[cs_dat.education.isnull()]
cs_dat[cs_dat["marital-status"].isnull()]
cs_dat[cs_dat.race.isnull()]
cs_dat[cs_dat.sex.isnull()]
cs_dat[cs_dat["hours-per-week"].isnull()]
cs_dat[cs_dat.income.isnull()]
As shown above, age has 198 missing values and workclass has 588 missing values. We need to fill in missing values for age with its column mean and remove all the instances where workclass is missing.
# Impute missing ages with the column mean, in place.
age_mean = cs_dat.age.mean()
cs_dat.age.fillna(age_mean, axis=0, inplace=True)
cs_dat[cs_dat.age.isnull()]
All the missing values for age attribute have been filled.
# Drop rows that still contain missing values (i.e. missing workclass).
cs_dat.dropna(axis=0, inplace=True)
cs_dat[cs_dat.workclass.isnull()]
All the instances where workclass attribute is missing have been removed.
cs_dat.describe(include="all")
Below are some visualizations showing the distribution of all variables
# Histogram of age.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Age Distribution")
ax1.set_xlabel('Age')
ax1.set_ylabel('Frequency')
cs_dat.age.plot(kind='hist', grid=True)
As shown above, the distribution of age is skewed to the right. For the age attribute, the mean is 38 years old and the median is 37 years old, while the attribute ranges from 17 to 90 years old.
# Bar chart of workclass counts.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Workclass Distribution")
ax1.set_xlabel('Workclass')
ax1.set_ylabel('Count')
cs_dat.workclass.value_counts().plot(kind='bar', grid=True)
For the workclass attribute, we have mostly private work class, which takes up 6974 out of 9412 instances.
# Histogram of years of education.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Education Distribution")
ax1.set_xlabel('Education')
ax1.set_ylabel('Frequency')
cs_dat.education.plot(kind='hist', grid=True)
For the years of education attribute, most of the instances are between 8 and 12 years and the distribution of the data is skewed to the left. The mean value is 10 years of education, and the median value is 10 years while the attribute data ranges from 1 to 16 years.
# Bar chart of marital-status counts.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Marital Status Distribution")
ax1.set_xlabel('Marital Status')
ax1.set_ylabel('Count')
cs_dat['marital-status'].value_counts().plot(kind='bar', grid=True)
For the marital status, the distribution between married and single is almost even with the number of married status being slightly greater than the single status. The married status has 4737 instances.
# Bar chart of race counts.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Race Distribution")
ax1.set_xlabel('Race')
ax1.set_ylabel('Count')
cs_dat.race.value_counts().plot(kind='bar', grid=True)
As shown above, most of the instances' race is White, which has 8062 instances.
# Bar chart of sex counts.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Sex Distribution")
ax1.set_xlabel('Sex')
ax1.set_ylabel('Count')
cs_dat.sex.value_counts().plot(kind='bar', grid=True)
As shown above, the data set has 6383 male and 3092 female.
# Histogram of hours worked per week.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Hours Per Week Distribution")
ax1.set_xlabel('Hours Per Week')
ax1.set_ylabel('Frequency')
cs_dat['hours-per-week'].plot(kind='hist', grid=True)
As shown above, most of instances have a working hour per week between 30 and 40 hours. The mean of the attribute is 41 hours and the median is 40 hours with the attribute values range between 1 and 99 hours.
# Bar chart of income-class counts.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Income Distribution")
ax1.set_xlabel('Income')
ax1.set_ylabel('Frequency')
cs_dat.income.value_counts().plot(kind='bar', grid=True)
As shown above, most of the instances' income is less or equal to 50K, which has a total of 7093 counts.
Below are the cross tabulation results and their corresponding bar charts.
# Cross-tabulate education by race and plot grouped bars.
ct1 = pd.crosstab(cs_dat.education, cs_dat.race)
ct1
plt.show(ct1.plot(kind = "bar", title = "Education Distribution With Different Races"))
As shown above, in general for each race, most observations' years of education fall into the 9-year bin, as this bin has the most values in all the graphs above. However, for Asian, about 75 instances have 13 years of education, which is the mode in the data set. For Black, Hispanic and White, they all have a mode of 9 years of education, followed by 10 years of education as the second most popular value and 13 years as the third most popular value. For American Indian, the mode value is 9 years, followed by 10 years as the second most popular value and 6 years as the third most popular value.
# Cross-tabulate workclass by income and plot grouped bars.
ct2 = pd.crosstab(cs_dat.workclass, cs_dat.income)
ct2
plt.show(ct2.plot(kind = "bar", title = "Work Class Distribution With Different Income Levels"))
As shown above, in the lower-income class, most of the people are private working class. In the higher income class, where the income is greater than 50K, most of the people are private working class as well. However, in the lower income class, the second most popular working class is public while in the higher income class, the second most popular working class is self employed.
# Cross-tabulate race by workclass and plot grouped bars.
ct3 = pd.crosstab(cs_dat.race, cs_dat.workclass)
ct3
plt.show(ct3.plot(kind = "bar", title = "Race Distribution With Different Work Class"))
As shown above, for each working class, White is the dominating race for all since most of the instances in the data set are White. However, most of the White race have private work class. For private and public working classes, the second most popular race is Black, followed by Asian to be the third. For self-employed working class, Asian is the second most popular race while Black is the third most popular class.
# Cross-tabulate race by income; also show per-race row percentages.
ct4 = pd.crosstab(cs_dat.race, cs_dat.income)
ct4
plt.show(ct4.plot(kind = "bar", title = "Race Distribution With Different Income Levels"))
# Normalize each row to proportions (row sums to 1).
pd.crosstab(cs_dat.race, cs_dat.income).apply(lambda r: r/r.sum(), axis=1)
As shown above in the graphs and the table indicating percentages of each race in either lower or upper income class, all the races have more percentage points in the lower income class than the higher income class. However, Hispanic and American Indian have over 90% of instances in the lower income class. Asian have almost a quarter of instances (23%) in the upper income class while White have slightly over a quarter of instances (26.27%) in the upper income class. For black, about 13.34% of instances are in the upper income class.
The following is a comparison between higher and lower income classes
# Split the data by income class for side-by-side comparison.
lower_income = cs_dat[cs_dat.income=="<=50K"]
higher_income = cs_dat[cs_dat.income==">50K"]
lower_income.describe(include="all")
higher_income.describe(include="all")
# Age histograms for the lower and upper income classes.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Age Distribution of Lower Income Class")
ax1.set_xlabel('Age')
ax1.set_ylabel('Frequency')
lower_income.age.plot(kind='hist', grid=True)
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Age Distribution of Upper Income Class")
ax1.set_xlabel('Age')
ax1.set_ylabel('Frequency')
higher_income.age.plot(kind='hist', grid=True)
As shown above, for the lower income class, most of people are between 15 and 45 years. For the upper income class, most of people are between 30 and 50 years old. The distribution of age for the lower income class is skewed to the right while the distribution of age for the upper income class is almost normally distributed.
# Workclass bar charts for the lower and upper income classes.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Work Class Distribution of Lower Income Class")
ax1.set_xlabel('Work Class')
ax1.set_ylabel('Frequency')
lower_income.workclass.value_counts().plot(kind='bar', grid=True)
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Work Class Distribution of Upper Income Class")
ax1.set_xlabel('Work Class')
ax1.set_ylabel('Frequency')
higher_income.workclass.value_counts().plot(kind='bar', grid=True)
As shown above, private working class dominates in both income classes. Public working class is the second most popular value for the lower income class and the third most popular value for the upper income class.
# Education histograms for the lower and upper income classes.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Education Distribution of Lower Income Class")
ax1.set_xlabel('Education')
ax1.set_ylabel('Frequency')
lower_income.education.plot(kind='hist', grid=True)
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Education Distribution of Upper Income Class")
ax1.set_xlabel('Education')
ax1.set_ylabel('Frequency')
higher_income.education.plot(kind='hist', grid=True)
As shown above, in general, people in the upper income class have more years of education than those in the lower income class. For the lower income class, most of people have years of education between 8 and 12 years. For the upper income class, most of people have years of education around 9 and 10 years. The proportion of people having 12 more years of education in the upper income class outnumbered that in the lower income class.
# Marital-status bar charts for the lower and upper income classes.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Marital Status Distribution of Lower Income Class")
ax1.set_xlabel('Marital Status')
ax1.set_ylabel('Frequency')
lower_income["marital-status"].value_counts().plot(kind='bar', grid=True)
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Marital Status Distribution of Upper Income Class")
ax1.set_xlabel('Marital Status')
ax1.set_ylabel('Frequency')
higher_income["marital-status"].value_counts().plot(kind='bar', grid=True)
As shown above, most of people in the lower income class are single while most of people in the upper income class are married.
# Race bar charts for the lower and upper income classes.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Race Distribution of Lower Income Class")
ax1.set_xlabel('Race')
ax1.set_ylabel('Frequency')
lower_income.race.value_counts().plot(kind='bar', grid=True)
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Race Distribution of Upper Income Class")
ax1.set_xlabel('Race')
ax1.set_ylabel('Frequency')
higher_income.race.value_counts().plot(kind='bar', grid=True)
Overall, for both income class, White is the most popular race, followed by Black as the second, Asian as the third, American Indian as the fourth and Hispanic as the fifth.
# Sex bar charts for the lower and upper income classes.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Sex Distribution of Lower Income Class")
ax1.set_xlabel('Sex')
ax1.set_ylabel('Frequency')
lower_income.sex.value_counts().plot(kind='bar', grid=True)
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Sex Distribution of Upper Income Class")
ax1.set_xlabel('Sex')
ax1.set_ylabel('Frequency')
higher_income.sex.value_counts().plot(kind='bar', grid=True)
As shown above, for both income classes, most of the people are male.
# Hours-per-week histograms for the lower and upper income classes.
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Hour Per Week Distribution of Lower Income Class")
ax1.set_xlabel('Hour Per Week')
ax1.set_ylabel('Frequency')
lower_income['hours-per-week'].plot(kind='hist', grid=True)
fig = plt.figure(figsize=(10,4))
ax1 = fig.add_subplot(121)
ax1.set_title("Hour Per Week Distribution of Upper Income Class")
ax1.set_xlabel('Hour Per Week')
ax1.set_ylabel('Frequency')
higher_income['hours-per-week'].plot(kind='hist', grid=True)
As shown above, in the lower income class, most of the people work between 10 and 60 hours per week with the mode being between 30 and 40 hours. For the upper income class, most of the people work between 30 and 70 hours per week with the mode being between 30 and 40 hours.
In general, people in the upper income class are generally older and have more education than those in the lower income class. In addition, people in the upper income class are mostly married in comparison to that people in the lower income class are mostly single. In general, people in the upper income class work more hours per week than those in the lower income class.
# One-hot encode categoricals; derive the binary target from the income
# dummies and drop BOTH income columns from the feature matrix.
cs_dat = pd.get_dummies(cs_dat)
cs_dat = cs_dat.drop('income_<=50K', axis=1)
labels = np.array(cs_dat['income_>50K'])
cs_dat = cs_dat.drop('income_>50K', axis=1)
# Preserve the feature-name order for the later decision-tree export.
col_features = cs_dat.columns
cs_dat.head(5)
cs_dat = np.array(cs_dat)
cs_dat
Naive Bayes
# Gaussian Naive Bayes evaluated with 10-fold cross-validation.
# NOTE(review): sklearn.cross_validation is the pre-0.18 module path;
# newer sklearn uses sklearn.model_selection.
from sklearn import cross_validation
nbclf = naive_bayes.GaussianNB()
nbclf = nbclf.fit(cs_dat, labels)
nb_cv_scores = cross_validation.cross_val_score(nbclf, cs_dat, labels, cv=10)
print("Overall Accuracy: %0.2f (+/- %0.2f)" % (nb_cv_scores.mean(), nb_cv_scores.std() * 2))
Decision Tree
# Decision tree (entropy criterion) evaluated with 10-fold cross-validation.
# Fix: the original assigned the entropy classifier to a typo name
# ("treelf"), so the earlier default-Gini tree object was silently re-fit
# and cross-validated instead of the entropy tree.
treeclf = tree.DecisionTreeClassifier(criterion="entropy")
treeclf = treeclf.fit(cs_dat, labels)
tree_cv_scores = cross_validation.cross_val_score(treeclf, cs_dat, labels, cv=10)
print("Overall Accuracy: %0.2f (+/- %0.2f)" % (tree_cv_scores.mean(), tree_cv_scores.std() * 2))
Linear Discriminant Analysis
# Linear Discriminant Analysis evaluated with 10-fold cross-validation.
# NOTE(review): sklearn.lda.LDA is the pre-0.17 import path; newer sklearn
# uses sklearn.discriminant_analysis.LinearDiscriminantAnalysis.
from sklearn.lda import LDA
ldclf = LDA()
ldclf = ldclf.fit(cs_dat, labels)
ld_cv_scores = cross_validation.cross_val_score(ldclf, cs_dat, labels, cv=10)
print("Overall Accuracy: %0.2f (+/- %0.2f)" % (ld_cv_scores.mean(), ld_cv_scores.std() * 2))
Overall, Linear Discriminant Analysis will be the best approach for this dataset among these three algorithms because it has the highest accuracy.
# Export the fitted decision tree to Graphviz and render it as a PNG.
from sklearn import tree
from os import system
from IPython.display import Image

# Fixes: the hand-typed name list in the original had 16 entries (it
# included "income_>50K", which was dropped from the feature matrix) for
# 15 feature columns; using the col_features index saved before the
# matrix conversion guarantees the names match the columns in order.
# The unused, Python-2-only `from StringIO import StringIO` was removed.
tree.export_graphviz(treeclf, out_file='tree.dot', feature_names=list(col_features))
# Requires Graphviz's `dot` on PATH.
system('dot -Tpng tree.dot -o tree1.png')
Image(filename='tree1.png')